### Replication code for "Family Matters?"
### Summer 2018
### See the readme file for more details on what goes where in this replication package
### Contact Ariel White with questions: arwhi@mit.edu

# this script contains the code to produce SI figures SI11, SI12, and SI20, so it's clear how they were produced. However, the code requires additional data to run; email me if you're interested in running this code locally and I can send them over.

# start with SI11 and SI12 (neighborhood incarceration rates)

#########################################################
# first, pull in shapefiles 
#########################################################

library(rgdal)
smallneighborsproj <- smallneighborsfull2[firstcase>"2011-12-31",] # now, focusing on main dataset

##Read in census tract shapefiles
#wait, or can I pull down shapefiles with census data attached? See here. https://www.census.gov/geo/maps-data/data/tiger-data.html
library(rgdal)
currwd <- getwd()
setwd(paste(currwd,"/censustracts_withcensusdata2010",sep="")) 
Harristracts <- readOGR(dsn="./", layer = "Tract_2010Census_DP1")
setwd(paste(currwd))
summary(Harristracts); proj4string(Harristracts)
coordinates(smallneighborsproj) <- c("Longitude", "Latitude")
proj4string(smallneighborsproj) <- CRS("+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0") #from above

##first, match all 2011 defendants to tracts
#pull in defendants and pare down again
load("Harrisdefendants20002012_Arcgeocoded.Rdata")
harrismatch <- geodef12flat
#trim down to Harris county addresses? using zipcodes set above
harrismatch <- harrismatch[harrismatch$ARC_ZIP %in% Harriscozips, ] #this should also trim out most non-matches for now.
harrismatch <- harrismatch[harrismatch$Score >48,] #but now drop all non-matched (but keep score cutoff pretty low for now?)

harrismatch <- harrismatch[harrismatch$crt<16,] # and for now keep only misdemeanants too.
harrismatch$firstcase_date <-  as.Date(as.character(harrismatch$firstcased), format = "%Y%m%d")
defs11 <- harrismatch[harrismatch$firstcase_date >= "2011-01-01" & harrismatch$firstcase_date <= "2011-12-31",]
length(unique(defs11$def_spn)); dim(defs11)

#clean up/subset as needed.
colnames(defs11)[colnames(defs11) == "coords.x2"] <- "Latitude" 
colnames(defs11)[colnames(defs11) == "coords.x1"] <- "Longitude" 
#give defendants coordinates
defs11_unproj <- defs11
coordinates(defs11) <- c("Longitude", "Latitude")
proj4string(defs11) <- CRS(" +proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0") 

#now try sticking defs in tracts
defsT1 <- spTransform(defs11, CRS(proj4string(Harristracts)))
joindefs <- over(defsT1, Harristracts)
names(joindefs)
head(joindefs)
dim(joindefs); dim(Harristracts)
#okay, so now there's a row for each defendant, which isn't what I want to end up with.  hmm.
#want to collapse so for each tractID, I have a measure of how many arrested and jailed in 2011
GEOID10 <- as.data.frame(joindefs$GEOID10)
deftracts <- as.data.frame(cbind(defs11_unproj, GEOID10))
colnames(deftracts)
head(deftracts)
colnames(deftracts)[colnames(deftracts) == "joindefs$GEOID10"] <- "GEOID10" 
deftracts <- data.table(deftracts)
deftracts[, arrested:= 1]
tractdefs <- deftracts[, list(arrested = sum(arrested), jailed= sum(anyjail) ), by=GEOID10]
dim(tractdefs); head(tractdefs)

##Now, assign voters to tracts
proj4string(smallneighborsproj)
harrissampleT1 <- spTransform(smallneighborsproj, CRS(proj4string(Harristracts)))
join1 <- over(harrissampleT1, Harristracts)
names(join1)

#okay, now I just need to go into the codebook and see what any of these variables are and then relabel/include them.
#what do I want? pop density, percent black, pct foreign born, poverty rate
#what can I actually get from these files short of merging in other data? total pop, avg age, pct black/latino, some stuff about rental mkt/housing tenure.
colnames(join1)[colnames(join1) == "DP0010001"] <- "tract10_totalpop" 
colnames(join1)[colnames(join1) == "DP0100002"] <- "tract10_totallatino" 
colnames(join1)[colnames(join1) == "DP0090002"] <- "tract10_totalblack" 
colnames(join1)[colnames(join1) == "DP0160001"] <- "tract10_avgHHsize" 
colnames(join1)[colnames(join1) == "DP0210001"] <- "tract10_housingtenure" #wait, I don't know what this is.

#need to join this to the original dataset.
dim(smallneighborsfull2); dim(join1)
smallneighborscensus <- cbind(smallneighborsfull2, join1)
dim(smallneighborscensus)
summary(smallneighborscensus$tract10_totalpop)
summary(smallneighborscensus$tract10_totallatino)
summary(smallneighborscensus$tract10_totalblack)
summary(smallneighborscensus$tract10_avgHHsize)
summary(smallneighborscensus$tract10_housingtenure)
smallneighborscensus[,pctlatino:=tract10_totallatino/tract10_totalpop ]
smallneighborscensus[,pctblack:=tract10_totalblack/tract10_totalpop ]

#now figure out avg arrest/incarc rates for voters, and think about how to subset
#so first merge in tractdefs from above. note that most tracts actually won't have anybody who got arrested/jailed in 2011 (though most families probably fall in places that did, especially those matched to 2011 cases, obvs)
sncensus <- merge(smallneighborscensus,tractdefs, by="GEOID10", all.x=T)
dim(sncensus); dim(smallneighborscensus); dim(tractdefs)

sncensus[, jailper1k := jailed / (tract10_totalpop / 1000) ]; summary(sncensus$jailper1k)
sncensus[is.na(jailper1k)==T, jailper1k := 0]
sncensus[, arrestper1k := arrested / (tract10_totalpop / 1000) ]; summary(sncensus$arrestper1k)
sncensus[is.na(arrestper1k)==T, arrestper1k := 0]

#now shorten it to times after 2011
sncen<- sncensus[firstcase>"2011-12-31", ]; dim(sncen)

#########################################################
# now rerun main analysis, splitting by rates
#########################################################


#now above/below med
ed2012 <- as.Date("2012-11-06") #election day

sncenbelowmed[,chargebefore:=0]; sncenbelowmed[firstcase < "2012-11-06" & anyconv==0,chargebefore:=1] 
sncenbelowmed[,convbefore:=0]; sncenbelowmed[firstcase < "2012-11-06" & anyconv==1 & anyjail==0,convbefore:=1] 
sncenbelowmed[,jailbefore:=0]; sncenbelowmed[firstcase < "2012-11-06" & anyconv==1 & anyjail==1,jailbefore:=1] 

weeks <- 1*40
storage <- as.data.frame(matrix(NA, nrow=weeks, ncol=13))
for (i in 2:weeks){
	lower <- ed2012 - i*7; higher <- ed2012 + i*7 #add/subtract off days to get the window
	window <- sncenbelowmed[firstcase < higher & firstcase > lower, ]
	#window[,chargebefore:=0]; window[firstcase < "2012-11-06",chargebefore:=1] #set up some treatment-timing indicators (these are above now)
	#window <- window[!(firstcase > ed2012-3 & firstcase < ed2012+1), ] #drop the few days just prior to election, since that's hard to attribute treatment
	ba1 <- lm(vote2012 ~ chargebefore+vote2010+vote2008+vote2006+vote2004, data=window[anyconv==0]); summary(ba1) #est. just-charge model
	ba1.vcovCL<-cluster.vcov(ba1, window[anyconv==0]$def_spn)
	ba1conv <- lm(vote2012 ~ convbefore+vote2010+vote2008+vote2006+vote2004, data=window[anyconv==1 & anyjail==0,]); summary(ba1) #just-conv
	ba1conv.vcovCL<-cluster.vcov(ba1conv, window[anyconv==1& anyjail==0,]$def_spn)
	ba1jail <- lm(vote2012 ~ jailbefore+vote2010+vote2008+vote2006+vote2004, data=window[anyjail==1,]); summary(ba1) #jail
	ba1jail.vcovCL<-cluster.vcov(ba1jail, window[anyjail==1,]$def_spn)
	storage[i,1] <- nrow(window) 
	storage[i,2] <- est <- coeftest(ba1, ba1.vcovCL)[2,1]
	storage[i,3] <- p <- coeftest(ba1, ba1.vcovCL)[2,4]
	err<- coeftest(ba1, ba1.vcovCL)[2,2]; dof <- ba1$df
	storage[i,4] <- est + -1*err*qt(0.975, dof) 
	storage[i,5] <- est + 1*err*qt(0.975, dof) 
	storage[i,6] <- est <- coeftest(ba1conv, ba1conv.vcovCL)[2,1]
	storage[i,7] <- p <- coeftest(ba1conv, ba1conv.vcovCL)[2,4]
	err<- coeftest(ba1conv, ba1conv.vcovCL)[2,2]; dof <- ba1conv$df
	storage[i,8] <- est + -1*err*qt(0.975, dof) 
	storage[i,9] <- est + 1*err*qt(0.975, dof) 
	storage[i,10] <- est <- coeftest(ba1jail, ba1jail.vcovCL)[2,1]
	storage[i,11] <- p <- coeftest(ba1jail, ba1jail.vcovCL)[2,4]
	err<- coeftest(ba1jail, ba1jail.vcovCL)[2,2]; dof <- ba1jail$df
	storage[i,12] <- est + -1*err*qt(0.975, dof) 
	storage[i,13] <- est + 1*err*qt(0.975, dof) 
}
colnames(storage) <- c("n", "case_est", "case_p", "case_lowCI", "case_highCI", "conv_est", "conv_p", "conv_lowCI", "conv_highCI", "jail_est", "jail_p", "jail_lowCI", "jail_highCI")
storage$row <- 1:nrow(storage)

pdf("beforeafter_belowmedincarc.pdf")
par(mfrow=c(1,3))
plot(storage$row, storage$case_est, ylim=c(-.3, .1), pch=19,main= "Charge", xlab="Weeks Around Election", ylab="Change in 2012 turnout (percentage points)")
abline(h=0, lty=2, col=gray(.4))
segments(storage$row, storage$case_lowCI, storage$row, storage$case_highCI, col="dodgerblue4")
#dev.off()

plot(storage$row, storage$conv_est, ylim=c(-.3, .1), pch=19, main= "Conviction", xlab="Weeks Around Election", ylab="Change in 2012 turnout (percentage points)")
abline(h=0, lty=2, col=gray(.4))
segments(storage$row, storage$conv_lowCI, storage$row, storage$conv_highCI, col="dodgerblue4")
#dev.off()

plot(storage$row, storage$jail_est, ylim=c(-.3, .1), pch=19, main= "Jail", xlab="Weeks Around Election", ylab="Change in 2012 turnout (percentage points)")
abline(h=0, lty=2, col=gray(.4))
segments(storage$row, storage$jail_lowCI, storage$row, storage$jail_highCI, col="dodgerblue4")
dev.off()



sncenabovemed[,chargebefore:=0]; sncenabovemed[firstcase < "2012-11-06" & anyconv==0,chargebefore:=1] 
sncenabovemed[,convbefore:=0]; sncenabovemed[firstcase < "2012-11-06" & anyconv==1 & anyjail==0,convbefore:=1] 
sncenabovemed[,jailbefore:=0]; sncenabovemed[firstcase < "2012-11-06" & anyconv==1 & anyjail==1,jailbefore:=1] 

weeks <- 1*40
storage <- as.data.frame(matrix(NA, nrow=weeks, ncol=13))
for (i in 2:weeks){
	lower <- ed2012 - i*7; higher <- ed2012 + i*7 #add/subtract off days to get the window
	window <- sncenabovemed[firstcase < higher & firstcase > lower, ]
	#window[,chargebefore:=0]; window[firstcase < "2012-11-06",chargebefore:=1] #set up some treatment-timing indicators (these are above now)
	#window <- window[!(firstcase > ed2012-3 & firstcase < ed2012+1), ] #drop the few days just prior to election, since that's hard to attribute treatment
	ba1 <- lm(vote2012 ~ chargebefore+vote2010+vote2008+vote2006+vote2004, data=window[anyconv==0]); summary(ba1) #est. just-charge model
	ba1.vcovCL<-cluster.vcov(ba1, window[anyconv==0]$def_spn)
	ba1conv <- lm(vote2012 ~ convbefore+vote2010+vote2008+vote2006+vote2004, data=window[anyconv==1 & anyjail==0,]); summary(ba1) #just-conv
	ba1conv.vcovCL<-cluster.vcov(ba1conv, window[anyconv==1& anyjail==0,]$def_spn)
	ba1jail <- lm(vote2012 ~ jailbefore+vote2010+vote2008+vote2006+vote2004, data=window[anyjail==1,]); summary(ba1) #jail
	ba1jail.vcovCL<-cluster.vcov(ba1jail, window[anyjail==1,]$def_spn)
	storage[i,1] <- nrow(window) 
	storage[i,2] <- est <- coeftest(ba1, ba1.vcovCL)[2,1]
	storage[i,3] <- p <- coeftest(ba1, ba1.vcovCL)[2,4]
	err<- coeftest(ba1, ba1.vcovCL)[2,2]; dof <- ba1$df
	storage[i,4] <- est + -1*err*qt(0.975, dof) 
	storage[i,5] <- est + 1*err*qt(0.975, dof) 
	storage[i,6] <- est <- coeftest(ba1conv, ba1conv.vcovCL)[2,1]
	storage[i,7] <- p <- coeftest(ba1conv, ba1conv.vcovCL)[2,4]
	err<- coeftest(ba1conv, ba1conv.vcovCL)[2,2]; dof <- ba1conv$df
	storage[i,8] <- est + -1*err*qt(0.975, dof) 
	storage[i,9] <- est + 1*err*qt(0.975, dof) 
	storage[i,10] <- est <- coeftest(ba1jail, ba1jail.vcovCL)[2,1]
	storage[i,11] <- p <- coeftest(ba1jail, ba1jail.vcovCL)[2,4]
	err<- coeftest(ba1jail, ba1jail.vcovCL)[2,2]; dof <- ba1jail$df
	storage[i,12] <- est + -1*err*qt(0.975, dof) 
	storage[i,13] <- est + 1*err*qt(0.975, dof) 
}
colnames(storage) <- c("n", "case_est", "case_p", "case_lowCI", "case_highCI", "conv_est", "conv_p", "conv_lowCI", "conv_highCI", "jail_est", "jail_p", "jail_lowCI", "jail_highCI")
storage$row <- 1:nrow(storage)

pdf("beforeafter_abovemedincarc.pdf")
par(mfrow=c(1,3))
plot(storage$row, storage$case_est, ylim=c(-.3, .1), pch=19,main= "Charge", xlab="Weeks Around Election", ylab="Change in 2012 turnout (percentage points)")
abline(h=0, lty=2, col=gray(.4))
segments(storage$row, storage$case_lowCI, storage$row, storage$case_highCI, col="dodgerblue4")
#dev.off()

plot(storage$row, storage$conv_est, ylim=c(-.3, .1), pch=19, main= "Conviction", xlab="Weeks Around Election", ylab="Change in 2012 turnout (percentage points)")
abline(h=0, lty=2, col=gray(.4))
segments(storage$row, storage$conv_lowCI, storage$row, storage$conv_highCI, col="dodgerblue4")
#dev.off()

plot(storage$row, storage$jail_est, ylim=c(-.3, .1), pch=19, main= "Jail", xlab="Weeks Around Election", ylab="Change in 2012 turnout (percentage points)")
abline(h=0, lty=2, col=gray(.4))
segments(storage$row, storage$jail_lowCI, storage$row, storage$jail_highCI, col="dodgerblue4")
dev.off()


#above/below median? top/bottom quartiles?
medianarrest <- median(sncen$arrestper1k)
arrest25 <- quantile(sncen$arrestper1k, .25)
arrest75 <- quantile(sncen$arrestper1k, .75)

medianjail <- median(sncen$jailper1k)
jail25 <- quantile(sncen$jailper1k, .25)
jail75 <- quantile(sncen$jailper1k, .75)

cor(sncen$jailper1k, sncen$arrestper1k, use = "complete.obs")
#v. v. close, maybe just go with jail

sncenabovemed <- sncen[jailper1k > medianjail, ]; dim(sncenabovemed)
sncenbelowmed <- sncen[jailper1k <= medianjail, ]; dim(sncenabovemed)
sncenabove75 <- sncen[jailper1k > jail75, ]; dim(sncenabove75)
sncenbelow25 <- sncen[jailper1k < jail25, ]; dim(sncenbelow25)


##################################
# finally, remake figure SI19 (the jail RDD one) but with 2016 voting instead: SI20
# (This code won't run without the recent voter file, but contact me if you want it)
# need to merge in that 2017 voter file (just have Harris County, so not perfect).

load("harrisvfile2016.Rdata") #got this from Nationbuilder
small2 <- merge(smallneighborsfull2, harrisvoters16, by="state_file_id")
dim(small2); dim(smallneighborsfull2) #how many get lost here?
dim(small2[fyear>2009,]); dim(smallneighborsfull2[fyear>2009,]) # about 6k
small2 <- merge(smallneighborsfull2, harrisvoters16, by="state_file_id", all.x=T)
dim(small2); dim(smallneighborsfull2)

small2[, vote2016 := 0] #now figure voting
small2[vote_method.General.2016=="voted" & is.na(vote_method.General.2016)==F, vote2016 := 1]
sum(small2$vote2016, na.rm=T) 

checkweeks <- c(-40:40)
storage <- as.data.frame(matrix(NA, nrow=length(checkweeks), ncol=2))
for (i in 1:nrow(storage)){
	week <- checkweeks[i]
	lower <- ed2012 + week*7; higher <- lower+7 #add/subtract off days to get the window
	window <- small2[firstcase < higher & firstcase >= lower & anyjail==1, ]
	#now just take some pretreatment characteristic means for this week.
	storage[i,1] <- week
	storage[i,2] <- mean(window$vote2016)
}

colnames(storage) <- c("week", "vote2016")
storage$row <- 1:nrow(storage)
pdf("turnout2016_jailed_loess_40weeks.pdf")
plot(storage$week, storage$vote2016, main= "2012 Turnout by Household Members", ylim=c(.25,.65), xlab="Case Timing: Weeks From 2012 Election", ylab="Proportion of Registrants Voting, Nov. 2016", cex.main=1.5, cex.axis=1.5,cex=1.5, cex.lab=1.45)
#hmm?
abline(v=0, lty=2)
vote.lo <- loess(storage$vote2016 ~ storage$week)
j <- order(storage$week)
lines(storage$week[j],vote.lo$fitted[j],col="darkgray",lty=2)
#fit two separate smoothers
vote.lo1 <- loess(storage$vote2016[1:40] ~ storage$week[1:40])
j <- order(storage$week)
lines(storage$week[j],vote.lo1$fitted[j],col="red",lwd=3)
vote.lo2 <- loess(storage$vote2016[41:81] ~ storage$week[41:81])
j <- order(storage$week)[41:81]
lines(storage$week[41:81],vote.lo2$fitted,col="red",lwd=3)
dev.off() #Figure SI20


